FE8828 Programming Web Applications in Finance

Week 2
Intermediate R Programming
Shiny/2: R Web Framework
Data Manipulation and EDA (Exploratory Data Analysis)/1

Dr. Yang Ye <Email:yy@runchee.com>

Nov 5, 2018

Lecture 4: Intermediate R Programming

Let’s review some R basics.

R: Vector

Vector/Matrix/List/Dataframe

# Create a vector from number
v <- c(1, 3)
v[1] <- 3
v
## [1] 3 3
# repeat 100 for 10 times.
rep(100, 10)
##  [1] 100 100 100 100 100 100 100 100 100 100

R: Matrix

# create matrix of 10x10
mat <- matrix(2, 3, 4)
mat
##      [,1] [,2] [,3] [,4]
## [1,]    2    2    2    2
## [2,]    2    2    2    2
## [3,]    2    2    2    2
# set first row to 4
mat[1,] <- 4
# set element (2, 2) to 6
mat[2, 2] <- 6

Find element in Vector

data <- 10:1
match(c(1, 3), data)
## [1] 10  8
data[match(c(1, 3), data)]
## [1] 1 3
which(1 == data | 3 == data)
## [1]  8 10
data[which(1== data | 3 == data)]
## [1] 3 1

Check whether element exists

match(c(11, 31), 10:1)
## [1] NA NA
which(11== 10:1 | 31 == 10:1)
## integer(0)
if (all(c(1, 33) %in% 1:3)) {
  cat("Found all\n")
}

if (any(c(1, 33) %in% 1:3)) {
  cat("Found one/some.\n")
}
## Found one/some.

Random

# Norm random number
rnorm(3, mean = 10, sd = 3)
## [1] 8.665469 7.522488 7.338813
# Uniform random number
runif(3)
## [1] 0.1575791 0.3392511 0.2027836
# Sample
sample(1:10, 10, replace = F)
##  [1]  5  3  1  9 10  4  2  8  7  6
# To Be/Not to Be
sample(c(T, F), 10, replace = T)
##  [1] FALSE  TRUE FALSE  TRUE FALSE FALSE  TRUE  TRUE FALSE  TRUE
# Throw a dice
sample(1:6, 10, replace = T)
##  [1] 2 3 2 1 4 1 6 4 4 1

Print

x <- c(Sys.Date(), Sys.Date(), Sys.Date())
cat(paste0("Current dates is ", x, ".\n"))
## Current dates is 2018-10-28.
##  Current dates is 2018-10-28.
##  Current dates is 2018-10-28.
cat(paste0("Current dates is ", paste0(x, collapse = ", "), ".\n"))
## Current dates is 2018-10-28, 2018-10-28, 2018-10-28.

String

# sub-string
# substr(x, start, stop) 
substr("The fox jumps.", 6, 6 + 5 - 1)
## [1] "ox ju"
# paste0/paste to concatenate string/convert to string
new_string <- paste0("This is ", "cat")
new_string <- paste0("This is ", "cat", sep = "a")
new_string <- paste0(1:3, sep = "a")
# toupper/tolower
toupper("big")
## [1] "BIG"
tolower("LOWER")
## [1] "lower"

Find/Replace string in string

# grepl: Find, returns T or F
grepl("A", "ABC", fixed = T)
## [1] TRUE
grepl("D", "ABC", fixed = T)
## [1] FALSE
# sub: replace for one time
# sub(pattern, replace, string,...)
# fixed = T means use fixed string. Not regular expression
sub("D", "ABC", "DDD", fixed = T)
## [1] "ABCDD"
# gsub: replace for all
gsub("D", "ABC", "DDD", fixed = T)
## [1] "ABCABCABC"

Find/Replace String with Regular Expression (RE)

If you start to use regular expression, sub/grepl becomes super powerful.

# If we need to find `Start` appearing the beginning of the string
grepl("^Start", "Start with me")
## [1] TRUE
grepl("^Start", "me Start")
## [1] FALSE
# To find something in the end
sub("X$", "Z", "XYZ ends with X")
## [1] "XYZ ends with Z"

Match/Extraction with Regular Expression (RE)

Match (RE)

sub("[^\\_]+\\_.*", "", "USDCNY_M1")
## [1] ""

Match/Extraction with Regular Expression (RE)

Extraction (RE)

# Rough cut
sub("([^\\_]+)\\_.*", "\\1", "USDCNY_M1")
## [1] "USDCNY"
# Nice cut
sub("([^\\_]+)\\_(.*)", "\\1 \\2", "USDCNY_M1")
## [1] "USDCNY M1"
# Wonderful cut
sub("([^\\_]+)\\_([[:alpha:]])([[:digit:]])", "\\1 \\2 \\3", "USDCNY_M1")
## [1] "USDCNY M 1"

Cheatsheat is available at https://www.rstudio.com/resources/cheatsheets/

Date

# Create date
dt1 <- as.Date("2017-11-03")
dt1
## [1] "2017-11-03"
dt2 <- Sys.Date()
dt2
## [1] "2018-10-28"
library(lubridate)
# Date is such a central role in finance.
# More function about date can be found in package `lubridate`
# Create date with lubridate, a package which provides lots of date functions.
ymd(20171003)
## [1] "2017-10-03"
ymd("20171003")
## [1] "2017-10-03"

Date: format code

We can use codes for convert date to/from string.

format(Sys.Date(), format = "%Y/%m/%d")
## [1] "2018/10/28"
as.Date("2017-11-03", format = "%Y-%m-%d") # %m for number month
## [1] "2017-11-03"
as.Date("2017-Nov-03", format = "%Y-%b-%d") # %b for the 3-letter month
## [1] "2017-11-03"
as.Date("03Nov2017", format = "%d%b%Y")
## [1] "2017-11-03"

Other functions from lubridate

library(lubridate)
# Change a date
x <- as.Date("2017-10-10")
month(x) <- 1
x
## [1] "2017-01-10"
# Set to the end of the month
day(x) <- days_in_month(x)

Business days

Use package bizdays

# install.packages("bizdays")
library(bizdays)
# 'weekends' is a calendar of weekdays
bizdays("2017-10-16", "2017-10-30", "weekends")
## [1] 10

# add bizdays
add.bizdays("2017-11-03", 5, "weekends")
## [1] "2017-11-10"

# generate all business days between.
# You will find this useful for later financial application.
bizseq("2017-11-10", "2017-12-01", "weekends")
##  [1] "2017-11-10" "2017-11-13" "2017-11-14" "2017-11-15" "2017-11-16"
##  [6] "2017-11-17" "2017-11-20" "2017-11-21" "2017-11-22" "2017-11-23"
## [11] "2017-11-24" "2017-11-27" "2017-11-28" "2017-11-29" "2017-11-30"
## [16] "2017-12-01"

Calendar

If not provided, start.date is by default the first holiday and end.date is the last holiday. So we provide them here.

create.calendar(name="Singapore", holidays = c(as.Date("2017-10-18")),
                start.date = as.Date("2017-01-01"), end.date = as.Date("2071-01-01"),
                weekdays = c("saturday", "sunday"))

bizdays("2017-10-13", "2017-10-30", "weekends")
## [1] 11
# One day less
bizdays("2017-10-13", "2017-10-30", "Singapore")
## [1] 10

Write function

Input parameters

func1 <- function() { }

func2 <- function(input1, input2) { }

# Param input1 is default to 1
func3 <- function(input1 = 1, input2) { }

func4 <- function(input1, input_type = c("int", "char"))
{
  # This would check wheher input_type is set to one of the pre-set values.
  input_type = match.arg(input_type) 
}

func5 <- function(in1, in2) {
  if (in1 < 0) {
    return(0)
  } else {
    return(in1 + in2)
  }
}

Write function


# The last value before function finishes will be returned automatically. No need to use return.
func5 <- function(in1, in2) {
  if (in1 < 0) {
    0
  } else {
    in1 + in2
  }
}

# Unless there is extra steps before
func6 <- function(in1, in2) {
  if (in1 < 0) {
    return(0) # if we have 0 here, it's not the last step before function exits.
  } else {
    res <- in1 + in2
  }

  res <- res * 3
  res
}

Exercise

Write functions to do

Time

Convert time to character/string

format(Sys.time(), format = "%H%M")
## [1] "1531"
format(Sys.time(), format = "%H:%M:%S")
## [1] "15:31:31"
format(Sys.time(), format = "%H:%M:%S")
## [1] "15:31:31"
library(lubridate)
ymd_hms("2011-12-31 12:59:59")
## [1] "2011-12-31 12:59:59 UTC"

Time

Change time, lubridate provides hour, minute

x <- Sys.time()
x
## [1] "2018-10-28 15:31:31 +08"
hour(x) <- 12
x
## [1] "2018-10-28 12:31:31 +08"
minute(x) <- 3
x
## [1] "2018-10-28 12:03:31 +08"
minute(x) <- 123 # what will happen?
x
## [1] "2018-10-28 14:03:31 +08"

List

# Create list with list() function
# Nameless list
# list[_n_] => item by order
a <- list(3, 4)
a[[1]]
## [1] 3
a[[2]]
## [1] 4

# Named list, you can use $ and [ operators
# list[[]]: gives back a value
# list$name => list[["name"]]
a <- list(a = 3, b = 4)
a[[1]]
## [1] 3
a[[2]]
## [1] 4
a[["a"]]
## [1] 3
a$a
## [1] 3

List

# When you want to use a number as key, use backtick
list_of_strikes <- list()
list_of_strikes$`65` <- 3
list_of_strikes$`60` <- 4

# if a name doesn't exist in the list
a$c
## NULL
# Use `is.null()` to check
if (is.null(a$c)) {
  cat("c doesn't exist in list a\n")
}
## c doesn't exist in list a

List

list <- list(elem = 1, c1 = "a", c2 = "b")

# access the list
list[[1]]
list$elem
# add new member to the list
list$new_elem <- 3
# update member in the list
list$c1 <- 3

# Delete c1 in the list
list$c1 <- NULL

list
## $elem
## [1] 1
## 
## $c2
## [1] "b"
## 
## $new_elem
## [1] 3

List’s Usage - 1

# List can be used as map/dictionary.
# Map
basket <- sample(c("Apple", "Orange", "Pear"), 100, replace = TRUE)
fruit_count <- list()
for (b in basket) {
  if (is.null(fruit_count[[b]])) {
    fruit_count[[b]] <- 1
  } else {
    fruit_count[[b]] <- fruit_count[[b]] + 1
  }
}
fruit_count
## $Orange
## [1] 31
## 
## $Pear
## [1] 33
## 
## $Apple
## [1] 36

List’s Usage - 2

# Let's write a generic function to do this
add_to_map <- function(map, key, value) {
  if (is.null(map[[key]])) {
    map[[key]] <- value
  } else {
    map[[key]] <- map[[key]] + value
  }
  map
}

# You may copy function add_to_map to every file that you want to use this kind of dictionary
fruit_count <- add_to_map(fruit_count, "Pomelo", 12)
fruit_count
## $Orange
## [1] 31
## 
## $Pear
## [1] 33
## 
## $Apple
## [1] 36
## 
## $Pomelo
## [1] 12

List’s Usage

# Use case 1: Use list to pass data in or out.
do_lots_of_work <- function(a, b, c) {
}
# pass in
do_lots_of_work <- function(lst) {
  lst$a + lst$b
}
# pass out
ret_lots_of_work <- function() {
  return(list(a = a, b = b))
}

res <- ret_lots_of_work()
res$a
## $a
## [1] 3
## 
## $b
## [1] 4
res$b
## [1] "Pear"
# Case 2: configuration
app_config <- list(MAX = 10, MIN = 10, DISPLAY_RESULT = T)

do_lots_of_work <- function(app_config) {
  app_config$MAX
}

Object

# Object
# Define class with attributes.
vanilla_option <- setClass("vanilla_option",
                           slots = c(type = "character",
                                     strike = "numeric",
                                     underlying = "numeric"))
# Create object, either way
opt1 <- new("vanilla_option", type = "c", strike = 100, underlying = 100)
opt2 <- vanilla_option(type = "c", strike = 100, underlying = 100)

# Use @ to visit member. or,
opt1@type
## [1] "c"
slot(opt1, "strike")
## [1] 100

Work with objects

# Generate a vector of options
opts <- sapply(1:10000, function(x) { 
                       vanilla_option(type = sample(c("c", "p"), 1),
                                      strike = round(runif(1) * 100, 0),
                                      underlying = round(runif(1) * 100, 0)) } )

# install.packages("fOptions")
library(fOptions)

start <- Sys.time()
# GBSOption also returns an object. We just need its price attribute.
res1 <- sapply(opts, function(o) {
  (GBSOption(o@type, o@underlying, o@strike, Time = 1,
             r = 0.01, b = 0, sigma = 0.3))@price
})
cat(as.numeric(Sys.time() - start))
## 3.997
head(res1, n = 4)
## [1] 2.077787e-05 7.722389e+01 1.197809e+01 3.265974e-04

# Alternatively to sapply, we can use map* functions from purrr package
# map is a generic function that returns a list
# map_dbl is for result of double, it would return a vector
res2 <- purrr::map_dbl(opts, function(o) {
  (GBSOption(o@type, o@underlying, o@strike, Time = 1,
             r = 0.01, b = 0, sigma = 0.3))@price
})
head(res2, n = 4)
## [1] 2.077787e-05 7.722389e+01 1.197809e+01 3.265974e-04

Anonymous Function

# Function that's defined in-place, which doesnt' need to have a name.
(function(x) { print(x) }) (3)
## [1] 3
# if there is only one line, you can skip { }
(function(x) print(x)) (3)
## [1] 3

# For longer functions, you can make it multi-lines.
(function(x) {
  if (x > 3) {
    print(x)
  } else {
    print(x - 3)
  }
})(3)
## [1] 0

purrr::map and sapply Function

# These two are equivalent.
res1 <- purrr::map(1:10, function(x) { rnorm(x, n = 10) })
# function(x) func(x) can be simplied as func.
res2 <- purrr::map(1:10, rnorm, n = 10)
head(res1, n = 1)
## [[1]]
##  [1]  0.5493080 -1.6168302  1.4921084  1.8161012  0.6206859  1.4630323
##  [7]  0.6399128  1.8858877  1.8166412  0.1557157
# purrr:map returns a list()
# This is what we really want to do. Generate ten normal distribution and get their mean.
# rnorm(n, mean = 0, sd = 1). Where doesn't input go to?
res <- purrr::map(1:10, rnorm, n = 1000)
map_dbl(res, mean)
##  [1] 0.9672962 2.0462016 2.9667426 4.0093552 5.0334546 6.0358025 6.9581536
##  [8] 7.9845969 8.9603169 9.9838452

# sapply achieves the same as purrr::map, a bit slower.
# Package purrr succeeds original R base.
sapply(1:10, function(x) x ^ 2 )
##  [1]   1   4   9  16  25  36  49  64  81 100
sapply(1:10, function(x) `^`(x, 2) )
##  [1]   1   4   9  16  25  36  49  64  81 100
sapply(1:10, function(x) `^`(2, x) )
##  [1]    2    4    8   16   32   64  128  256  512 1024

Read/Write data

# set working directory
setwd("C:/TEMP")
# Save this_is_var1 to a file
saveRDS(this_is_var1, file = "C:/TEMP/DATA/data.Rds")
# Load a variable from a file. `new_loaded` is the name given to it.
new_loaded <- readRDS(file = "C:/TEMP/DATA/data.Rds")

R: data frame

The basic structure of a data frame:

df <- data_frame(
  date = seq(as.Date("2017-01-01"), as.Date("2017-01-10"), by = "day"),
  stock = replicate(10, paste0(sample(LETTERS, 3, replace = T), collapse = "")),
  quantity = round(runif(10) * 10000 ,0))
# df["date"]: gives a data frame
# df[["date"]]: gives value
# df$date: same as [["date"]]

# Get three rows
df[c(3, 6, 9), , drop = F]
# Get three columns
df[, 1, drop = F]
# This would return a vector
df[, 1, drop = T]
##  [1] "2017-01-01" "2017-01-02" "2017-01-03" "2017-01-04" "2017-01-05"
##  [6] "2017-01-06" "2017-01-07" "2017-01-08" "2017-01-09" "2017-01-10"

# Use column names
df[, c("date", "quantity"), drop = F]

R: data frame

Common functions for data frame

View()
head()
tail()
str()
nrow()
ncol()
dim() # returns both nrow and ncol
colnames()/rownames()

Birthday Problem

## [1] 15
## [1] 108
## [1] 37.738

Fastest Fish Problem

## res_sim: 2.9236
## res_ana: 2.92896825396825

Lecture 5: Shiny/2: R Web Framework

Minimalist

library(shiny)
ui <- fluidPage("Hello World")
server <- function(input, output, session) { }
shinyApp(ui = ui, server = server)

Think around Input and Outputs

ui <- fluidPage(
  titlePanel("Hello World with a Histogram"),
  # Input() functions
  numericInput("num", "Number of Sample", value = 30),
  # Output() functions
  plotOutput("hist")
)

Input

All input function follow such function signature except for input-specific parameters.

inputXXX(inputId = "input name", label = "label to display", ...)

Reference: https://shiny.rstudio.com/reference/shiny/1.0.5/

Output

All output function follow such pattern.

yyyOutput(outputId = "output name")

plotOutput: I suggest to set width and height to fixed size so we need extra parameters. For others, outputId is good enough.

Server

Sever is to fill the content of output

server <- function(input, output, session) {
  # Enable either one of two
  output$hist <- renderPlot({ hist(rnorm(100)) })
  
  if (FALSE) {
    output$hist <- renderPlot({
      title("a normal random number histogram")
      hist(rnorm(input$num))
    })
  }
}

shinyApp = UI + Server

UI and Server combines to be a ShinyApp. UI is to run the same for each browser/client. Server is separate between different users.

shinyApp(ui, server)

Reactivity Kicks In

Reactive values work together with reactive functions.

  1. Reactive function responds. input$x => output$y
  2. Reactive value notifies. input$x => expression() => output$y

Reactivity - 1

Reactivity is enabled by placing inputXXX in renderXXX function.

library(shiny)

ui <- fluidPage(
  numericInput("num", "Num", 100),
  # numericInput("mean", "Mean", 5),
  # numericInput("sd", "SD", 3),
  numericInput("lambda", "Lambda", 1),
  plotOutput("p1")
)

server <- function(input, output, session) {
  output$p1 <- renderPlot({
    # hist(rnorm(input$num, mean = input$mean, sd = input$sd))
    hist(rpois(n = input$num, lambda = input$lambda))
  })
}

shinyApp(ui, server)

Reactivity - 2

library(shiny)

ui <- fluidPage(
  numericInput("num", "Num", 10),
  actionButton("go", "Go"),
  plotOutput("p1")
)

server <- function(input, output, session) {
  observeEvent(input$go, {
    output$p1 <- renderPlot({
      hist(rnorm(isolate(input$num)))
    }) 
  })
}

shinyApp(ui, server)

Reactivity - 3

We can add a reactiveValue with eventReactive

library(shiny)

ui <- fluidPage(
  numericInput("num", "Num", 10),
  actionButton("go", "Go"),
  plotOutput("p1")
)

server <- function(input, output, session) {
  data <- eventReactive(input$go, {
    hist(rnorm(input$num))
  })

  output$p1 <- renderPlot({ data() })
}

shinyApp(ui, server)

Output

For tableOutput

output$t1 <- renderTable(iris)

output$t1 <- renderTable({
  some input..
  output is a data frame.
})

For dataTableOutput (Dynamic table)

output$t2 <- renderDataTable(iris)

For plotOutput

output$p2 <- renderPlot({ plot(runif(1000), runif(1000)) })

For textOutput and verbatimTextOutput

output$t3 <-  renderText({ "foo" })
output$t4 <- renderPrint({
  print("foo")
  print("bar")
})

Example: Shiny-24

library(shiny)
library(DT)

ui <- fluidPage(
  h3("t1"),
  tableOutput("t1"),
  hr(),
  fluidRow(
    column(9, h3("dt1"),
           dataTableOutput("dt1")),
    column(3,   h3("x4"),
           verbatimTextOutput("x4"))),
  hr(),
  fluidRow(
    column(8, h3("dt2"),
           dataTableOutput("dt2")),
    column(4, h3("p5"),
              plotOutput("p5")))
)

options(error = function() traceback(2))

server <- function(input, output, session) {
  output$t1 <- renderTable(iris[1:10,], striped = T, hover = T)
  output$dt1 <- renderDataTable(iris, options = list( pageLength = 5))
  output$x4 <- renderPrint({
      s = input$dt1_rows_selected
      if (length(s)) {
        cat('These rows were selected:\n\n')
        cat(s, sep = ', ')
      }
    })    
    
  output$dt2 <- renderDataTable(iris,
                                options = list(pageLength = 5),
                                server = FALSE)
  output$p5 <- renderPlot({
    s <- input$dt2_rows_selected
    plot(iris$Sepal.Length, iris$Sepal.Width)
    if (length(s)) {
      points(iris[s, c("Sepal.Length", "Sepal.Width"), drop = F],
             pch = 19, cex = 1, col = "red") 
    }
  })
}

shinyApp(ui, server)

Debug Shiny

Shiny Summary

Shiny Assignment

  1. Add a selectInput for different color names, returned from colors().

    plot(1:10, pch = 19, cex = 1, col = “skyblue1”)

  2. Create a Bond Schedule

\(NPV = \frac{Cashflow 1}{(1 + yield)^1} + \frac{Cashflow 2}{(1 + yield)^2} + ... + \frac{Last Cashflow}{(1 + yield)^n}\)

For a Bond with fixed coupon \(Bond Price = Coupon * \frac{1 - (\frac{1}{(1 + yield)^n})}{yield} + \Big[MaturityValue * \frac{1}{(1 + yield)^n}\Big]\)

Lecture 6: Data Manipulation and EDA (Exploratory Data Analysis)/1

Tidyverse

install.packages(“tidyverse”)

SQL

Let’s start from SQL, which first appeared in 1974; 43 years ago.

CRUD: Create Read Update Delete

Data engineering was born around 70s with SQL.

SQL does CRUD

# Select everything from Shops.
SELECT * FROM Shops;

# Select with a filter
SELECT * FROM Shops WHERE size = "Big";

# Select with a filter and order
SELECT * FROM Shops WHERE size = "Big" ORDER BY Name;

# Select with a filter, order, group and summary function `sum`
SELECT Region, sum(Sales) FROM Shops WHERE size = "Medium" GROUP BY Region;

# Insert a new record to Shops.
INSERT into Shops (Name, Region, Sales) VALUES ("Costco", "North", 123456, ...);

# Update a field
UPDATE Shops SET Sales = Sales + 1000 WHERE Name = "Costco";

# Delete from Shops with a filter
DELETE from Shops WHERE Sales < 1000

Data frame does CRUD

df <- data.frame(a = 1:10, b = 10:1)
# Filter:
df[which(df$a == 3 | df$b == 3), , drop = T] 
df[match(3, df$a), , drop = T]
## $a
## [1] 3
## 
## $b
## [1] 8
df[, match("b", colnames(df)), drop = T] 
##  [1] 10  9  8  7  6  5  4  3  2  1

# Insert
rbind(df, df)

# Delete
df[-(which(df$a == 3 | df$b == 3)), , drop = T]

# Update
df[which(df$a == 3 | df$b == 3), 2] <- 3

dplyr

dplyr package from tidyverse is a high-performance package to deal with data frame.

# tidyverse is a bundle of packages.
# I usually load them all with library(tidyverse, instead of library(dplyr) individually.
library(tidyverse)
# Loading tidyverse: ggplot2
# Loading tidyverse: tibble
# Loading tidyverse: tidyr
# Loading tidyverse: readr
# Loading tidyverse: purrr
# Loading tidyverse: dplyr

# Note:
# filter(): dplyr, stats
# lag():    dplyr, stats
# Use dplyr::lag and dplyr::filter when it doesn't work.

How dplyr works

dplyr provides functions in “verbs”, which is functions that does one thing only. We will learn to use the following.

Sample dataset

A data-driven approach to predict the success of

telemarketing ### Author: Sérgio Moroa; Paulo Cortezb; Paulo Ritaa #### http://dx.doi.org/10.1016/j.dss.2014.03.001

I chose this data set of a Portuguese retail bank clients profile

Sample dataset columns

variable = column = field

Personal profile

Other attributes:

Output variable (desired target):

Read data

Use RStudio’s File -> Import Dataset, you may choose either “From Text (base)” or “From Text (readr)”. Either way loads the data.

base comes with R. readr is a package from tidyverse that provides more options and functionality. Copy the generated code to your script file.

I place it at https://goo.gl/fFQAAm (for Download), https://goo.gl/PBQnBt (for direct use).

You may download it and save it to local.

# Use base
bank <- read.csv("example/data-bank/bank.csv", sep=";") # or,
bank <- read.csv("https://goo.gl/PBQnBt", sep = ";")

# use readr
library(readr)
bank <- read_delim("example/data-bank/bank.csv", 
                    ";", escape_double = FALSE, trim_ws = TRUE)
## Parsed with column specification:
## cols(
##   age = col_integer(),
##   job = col_character(),
##   marital = col_character(),
##   education = col_character(),
##   default = col_character(),
##   balance = col_integer(),
##   housing = col_character(),
##   loan = col_character(),
##   contact = col_character(),
##   day = col_integer(),
##   month = col_character(),
##   duration = col_integer(),
##   campaign = col_integer(),
##   pdays = col_integer(),
##   previous = col_integer(),
##   poutcome = col_character(),
##   y = col_character()
## )
View(bank)

select

select(df, ...), … can be

select - Examples

subset <- select(bank, marital)
subset <- select(bank, 1)
subset <- select(bank, -1)
subset <- select(bank, -job)
subset <- select(bank, -(job:education))
subset <- select(bank, starts_with("p"))
subset <- select(bank, ends_with("p"))
subset <- select(bank, contains("p"))

select as a re-arrangement of columns.

job_first <- select(bank, job, everything())
bank

filter

colnames(bank)
##  [1] "age"       "job"       "marital"   "education" "default"  
##  [6] "balance"   "housing"   "loan"      "contact"   "day"      
## [11] "month"     "duration"  "campaign"  "pdays"     "previous" 
## [16] "poutcome"  "y"

young <- dplyr::filter(bank, age < 40)
another_young <- dplyr::filter(bank, age < 20 & marital == "married")
just_young <- dplyr::filter(bank, age < 20 & marital == "single")

young2 <- dplyr::filter(bank, age >= 20 & age < 30)
another_young2 <- dplyr::filter(bank, age >= 20 & age < 30 & marital == "married")
just_young2 <- dplyr::filter(bank, age >= 20 & age < 30 & marital == "single")

filter - logic operators

filter - string operations

# %in% to match multiple
second_upper <- dplyr::filter(bank, education %in% c("tertiary", "secondary"))

# filter out NA value.
no_na <- dplyr::filter(bank, !is.na(balance) & balance > 0)

Exercise

rename

# rename(new name = old)
# Use tick to quote special strings.
df <- rename(bank, young_age = age) 
df <- rename(bank, `Age in Bank` = age)

arrange

# arrange is sort
arrange(bank, job)
arrange(bank, default, job)

# descending for day
arrange(bank, desc(day))
arrange(bank, desc(as.Date(day, format="%d", origin = Sys.Date())))

NB: Missing values are always sorted at the end.

Exercise

mutate

# Replace existing
# ifelse is to check condition.
df1 <- mutate(bank, y = ifelse(y == "yes", T, F))

# Add a new column.
df2 <- mutate(bank, duration_diff = duration - mean(duration, na.rm = TRUE))

# case_when is a function to deal multiple choices.
df2_age_group <- mutate(bank, age_group = case_when(
  age < 20 ~ "youth",
  age < 40 ~ "middle-age",
  age < 50 ~ "senior",
  TRUE ~ "happy"
))

df2_age_group_res <-
  group_by(df2_age_group, age_group) %>%
  summarise(mean_age = mean(age)) %>%
  transmute(mean_age_diff = mean_age - lag(mean_age))

mutate/2

firstup <- function(x) {
  substr(x, 1, 1) <- toupper(substr(x, 1, 1))
  x
}

# month.abb is a built-in array of month names.
df3 <- mutate(bank, month_name = factor(firstup(as.character(month)), levels = month.abb))

# transmute would remove all other columns after mutation, only keeping the new variable.
df5 <- transmute(bank, 
                  duration_trend = duration - mean(duration, na.rm = TRUE),
                  balance_trend = balance - mean(balance, na.rm = TRUE))

What you can do with mutate

y <- c(1, 2, 2, NA, 3, 4)
row_number(y)
## [1]  1  2  3 NA  4  5
min_rank(y)
## [1]  1  2  2 NA  4  5
ntile(y, 2)
## [1]  1  1  1 NA  2  2

%>%

We may write such code.

df <- select(df, x)
df <- mutate(df, a = 1)
df <- rename(df, a = b)
df <- arrange(df, x)

# This is effectively,
arrange(rename(mutate(select(df, x), a = 1), a = b), x)

third(second(first(x)))

How about this?

df %>% select %>% mutate %>% rename %>% arrange

%>% Benefits

%>% operator allows you to transform the flow from nesting to left-to-right fashion, i.e.

first(x) %>% second() %>% third()

x %>% first() %>% second() %>% third() # this could also do.

x %>% first(.) %>% second(.) %>% third(.) # . represents the input

What’s the output of below?

c(1, 3, 7, 9) %>% {
  print(.)
  mean(.)
} %>% { . * 3 } %>% {
  print(.)
  sample(round(., 0))
}
## [1] 1 3 7 9
## [1] 15
##  [1] 12 11  3  8  4 14  9 15  1  6  2 13 10  5  7

Work with Pipe

%>% … %>%

# Feed the data for multiple processing
{
  v <- .
  cn <- colnames(v)

  v <- select(v, u, z)
  colnames(v) <- cn[1:3]
  v
} 

# How to return multiple value

%>% {
  assign("data_name", data, envir = parent.env(environment()) )
} %>% {
  select(., z < 0.4)
}

# or, we use list
%>% {
  list(a, b)
}  %>% {
  v <- .
  v$a
  v$b
}

Code pattern with Pipe

df %>%
... %>%
... %>%
... %>%
{
  v <- .
  ggplot(data = v) + 
    # full data is used here
    geom_line(data = v) +
    # partial data needs to be hightlighted.
    geom_line(data = filter(., some condition), color = "red")
}

Use of Caution

Pros:

Cons:

Environment

Environment is where your data resides. Use local() to isolate.

# local stores the data wihtin the boundary of {}
x <- 3
local({
  print(x)
  x <- 1
  print(x)
})
## [1] 3
## [1] 1
print(x)
## [1] 3
# local stores the nearest environment
x <- 3
{
  print(x)
  x <- 1
  print(x)
}
## [1] 3
## [1] 1
x
## [1] 1
get_sum <- function(i) {
  v <- 0
  for (i in 1:10) {
    v <- v + i
  }
  v
}

get_sum(10)
## [1] 55

# object 'v' not found
v
## [1] 3 3

Environment

Use assign() to do space-jump.

# assign data to global environment
x <- 1
pass_out_global <- function() {
  assign("x", 3, envir = .GlobalEnv)  
}

# assign data to just one level up 
pass_out <- function(env) {
  print(env)
  assign("x", 2, envir = env)
}
x <- 1
pass_out(environment())
## <environment: R_GlobalEnv>
x
## [1] 2

# assign data to pass it out of function
extra_layer <- function(env) {
  pass_out(env)
}

x <- 1
extra_layer(env = environment())
## <environment: R_GlobalEnv>
x
## [1] 2

extra_layer_g <- function() {
  pass_out_global()
}

x <- 1
extra_layer_g()
x
## [1] 3

Summary